\documentclass[10pt,a4paper]{article} 

\usepackage{ctex} % 中文支持
\usepackage[top=2.5cm, bottom=2.5cm, left=2.5cm, right=2.5cm]{geometry} % 页边距
\usepackage{amsmath, amssymb} % 数学公式与符号
\usepackage{graphicx}
\usepackage{pythonhighlight}
\usepackage{url} 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\usepackage{titling}
\setlength{\droptitle}{-2cm} % 标题上移

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%文档的题目、作者与日期
\author{五六七 }
\title{盐矿的判别分类 }

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{document}

\maketitle

\begin{abstract}
根据一些已知盐矿类别的样本数据，和一些未知盐矿的观测数据，对未知盐矿进行判别分析。
\end{abstract}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\setcounter{tocdepth}{2}
%\renewcommand\contentsname{目录}
%
%\renewcommand {\baselinestretch} {1.3}\normalsize 
%\tableofcontents 
%\renewcommand {\baselinestretch} {1.0}\normalsize


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{问题描述}
已知云南某地的一些盐矿氛围钾盐和钠盐两种类别。我们已经掌握的两类盐矿的历史样本数据如表所示。
并测得一些未知盐矿的样本数据。请对代判样本进行判别分析。

\begin{table}[ht!]\centering
\caption{一些盐矿的样本数据和分类信息} \vspace{0.2cm}
\begin{tabular}{|c|c|c|c|c|c|}\hline
类别 & 编号 & $x_1$ &  $x_2$ &  $x_3$ &  $x_4$  \\ \hline  \hline  
钾盐&1 &13.85&	2.79&	7.8&		49.6  \\ \hline  
钾盐&2 &22.31&	4.67&	12.31&	47.8  \\ \hline  
钾盐&3 &28.82&	4.63&	16.18&	62.15  \\ \hline  
钾盐&4 &15.29&	3.54&	7.58&	43.2  \\ \hline  
钾盐&5 &28.29&	4.9&		16.12&	58.7  \\ \hline  \hline  
钠盐&1 &2.18&		1.06&	1.22&	20.6  \\ \hline  
钠盐&2 &3.85&		0.8&		4.06&	47.1  \\ \hline  
钠盐&3 &11.4&		0&		3.5&		0  \\ \hline  
钠盐&4 &3.66&		2.42&	2.14&	15.1  \\ \hline  
钠盐&5 &12.1&		0&		5.68&	0  \\ \hline  \hline  
待判&1 &8.85&		3.38&	5.17&	26.1  \\ \hline  
待判&2 &28.6&		2.4&		1.2&		127  \\ \hline  
待判&3 &20.7&		6.7&		7.6&		30.8  \\ \hline  
待判&4 &7.9&		2.4&		4.3&		33.2  \\ \hline  
待判&5 &3.19&		3.2&		1.43&	9.9  \\ \hline  
待判&6 &12.4&		5.1&		4.48&	24.6  \\ \hline  
\end{tabular}
\end{table}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{建立模型}

我们使用 Fisher 判别法。基本思想是将多维数据 $X=(x_1,x_2,x_3,x_4)^T$ 通过线性组合得到一维数据
\begin{eqnarray}
y=a^TX = \begin{bmatrix} a_1 & a_2 & a_3 & a_4 \end{bmatrix}\cdot \begin{bmatrix} x_1\\ x_2 \\ x_3 \\ x_4 \end{bmatrix}
= a_1x_1 + a_2x_2 + a_3x_3 + a_4x_4. 
\end{eqnarray}

对两组已知分类的样品数据 $G_1$ 和 $G_2$,  分别计算线性组合的均值，
\begin{eqnarray}
\mu_{y_1} &=& \mathbb{E}(y \mid y= a^TX, X\in G_1) = a^T\mu_1, \\ 
\mu_{y_2} &=& \mathbb{E}(y \mid y= a^TX, X\in G_2) = a^T\mu_2.  
\end{eqnarray}
其中 $\mu_1$ 和 $\mu_2$ 是两组样品的均值向量。计算线性组合得到的一维数据 $y$ 的方差为
\begin{eqnarray}
\sigma_y^2 = \text{Var}(y) = \text{cov}(a^TX, a^TX) = a^T \text{cov}(X,X) a = a^T \Sigma a,   
\end{eqnarray}
其中 $\Sigma$ 是已知分类的两组样品数据 $G_1\cup G_2$ 的协方差矩阵。
%
考虑两组数据的均值差的平方与两组数据的方差的比例，这是一个无量纲的量，预期能够最大程度地刻画两类数据的不同之处，
\begin{eqnarray}
\frac{ ( \mu_{y_1} - \mu_{y_2} )^2 }{\sigma_y^2} = \frac{ [a^T(\mu_1 - \mu_2)]^2 }{a^T\Sigma a}.
\end{eqnarray}
Fisher 判别的思想是取组合系数 $a$, 使得上述比例最大。然后用线性组合 $y=a^TX$ 对未知分类的样品进行分类判别。
数学证明可知，当 $a=\Sigma^{-1}(\mu_1-\mu_2)$ 时，上述均值平方与方差的比例达到最大。因此定义判别函数为 
\begin{eqnarray}
W &=& (\mu_1-\mu_2)^T \Sigma^{-1} X - \frac{1}{2}(\mu_{y_1}+\mu_{y_2}) \\ 
&=& (\mu_1-\mu_2)^T \Sigma^{-1} X - \frac{1}{2}a^T(\mu_1+\mu_2) \\ 
&=& (\mu_1-\mu_2)^T \Sigma^{-1} X - \frac{1}{2}(\mu_1-\mu_2)^T \Sigma^{-1} (\mu_1+\mu_2) \\ 
&=& (\mu_1-\mu_2)^T \Sigma^{-1} \left[ X - \frac{1}{2} (\mu_1+\mu_2) \right]. 
\end{eqnarray}
 则有如下判别规则。
 \begin{enumerate}
\item  当 $W(X)\ge 0$ 时，判别 $X$ 为第一类；
\item  当 $W(X)< 0$ 时，判别 $X$ 为第二类。
\end{enumerate}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{编程计算}

首先载入数值计算包 \verb+numpy+ 和机器学习包 sklearn 的判别分析小包 \verb+discriminant_analysis+. 
\begin{python}
import numpy as np
from numpy.linalg import inv
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
\end{python}

然后载入数据，将矩阵的前5行保存为变量 \verb+a1+, 这是已知判别为钾盐的5个样品。
中间5行保存为变量 \verb+a2+, 这是已知判别为钠盐的5个样品。
其余行保存为变量 \verb+x+, 这是6个待判样品。
\begin{python}
a = np.loadtxt('data11_11.txt')
a1 = a[:5, :]
a2 = a[5:10, :]
x = a[10:, :]
\end{python}

计算已知判别的10个样品的协方差阵，然后计算协方差阵的逆阵。
\begin{python}
V = np.cov(a[:10, :].T, ddof=1) 
VI = inv(V)
\end{python}

分别计算已知判别的两组样品的各个指标的均值。
\begin{python}
mu1 = a1.mean(axis=0)
mu2 = a2.mean(axis=0)
\end{python}

计算判别函数的系数向量，和判别函数的常数项。
\begin{python}
k = VI @ (mu1-mu2)  
b = -(mu1+mu2) @ VI @ (mu1-mu2)/2  
\end{python}

对待判样品，计算判别函数的值，并打印。
\begin{python}
val = x @ k + b  
print('The values of the discriminant function; ', val)
\end{python}

得到判别函数的值如下。 
\begin{python}
[-0.55866097 17.18438806  4.21819717 -0.2045517  -1.21740648  2.05665806]
\end{python}

如果判别函数的值大于零，那么判别为A类（钾盐）；如果判别函数的值小于零，那么判别为B类（钠盐）。
输出按照判别法手工直接计算的判别结果。
\begin{python}
d = {0:'B', 1:'A'}
print('The result: ', [d[e>0] for e in val])
\end{python}

直接计算结果如下。
\begin{python}
['B', 'A', 'A', 'B', 'B', 'A']
\end{python}

为了直接使用库函数，对前10个样品定义判别结果向量，前5个为1，后5个为0. 
\begin{python}
y0 = np.hstack([np.ones(5), np.zeros(5)])
\end{python}

使用库函数 LDA, 将判别结果保存为变量 \verb+md+.
\begin{python}
md = LDA().fit(a[:10, :], y0) 
\end{python}

提取判别结果的各项内容：判别函数的系数和截距项。
\begin{python}
k2 = md.coef_
b2 = md.intercept_ 
\end{python}

验证手工计算和调用库函数的结果一样。
\begin{python}
c = b2/b
check = k * c 
\end{python}

使用库函数 LDA 得到的判别模型来预测未知样品的分类，并打印分类结果。
\begin{python}
val2 = md.predict(x)
print('The result by LDA is: ',[d[e] for e in val2])
\end{python}

分类结果如下。   
\begin{python}
The result by LDA is: ['B', 'A', 'A', 'B', 'B', 'A']
\end{python}

打印两种方法得到的判别函数的系数和截距项。
\begin{python}
print('k=',k, ',b=',b);
print('k2=',k2, ',b2=',b2)
\end{python}

结果如下。
\begin{python}
k= [ 0.42911168  0.38128565 -0.77367551  0.06511657] ,b= -3.34468485581427
k2= [[ 4.90880912  4.36170482 -8.85043582  0.74489885]] ,b2= [-38.26141377]
\end{python}

打印两种方法得到的判别函数的比例。打印判别函数对已知样本的误判率。
\begin{python}
print('The proportion is: ', c)
print('The error rate is: ', 1-md.score(a[:10, :], y0))
\end{python}

结果如下。
\begin{python}
The proportion is: [11.43946752]
The error rate is: 0.0
\end{python}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{检验模型}
将已知分类的10个样品代入判别函数，可得模型的分类全部是正确的。


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage

\section{回答问题}
将未知分类的6个样品代入判别函数，并根据判别规则，可得预测分类如下。
\begin{table}[ht!]\centering
\caption{未知样本的指标数据和预测分类} \vspace{0.2cm}
\begin{tabular}{|c|c|c|c|c|c|}\hline
类别 & 编号 & $x_1$ &  $x_2$ &  $x_3$ &  $x_4$  \\ \hline  \hline  
钠盐&1 &8.85&		3.38&	5.17&	26.1  \\ \hline  
钾盐&2 &28.6&		2.4&		1.2&		127  \\ \hline  
钾盐&3 &20.7&		6.7&		7.6&		30.8  \\ \hline  
钠盐&4 &7.9&		2.4&		4.3&		33.2  \\ \hline  
钠盐&5 &3.19&		3.2&		1.43&	9.9  \\ \hline  
钾盐&6 &12.4&		5.1&		4.48&	24.6  \\ \hline  
\end{tabular}
\end{table}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{子空间视图}

指标数据是四维的，可以投影到任意的二维子空间，并画出图像。

\begin{figure}[ht!]\centering
\includegraphics [height=8cm, width=12cm]{projection_four_subplots.png}
\caption{数据在不同的二维子空间中的图像 }
\end{figure}

上述图中蓝色的是钾盐的样品指标数据，红色的是钠盐的样品指标数据。
从中可以清楚看出这两种盐的特征有很大的不同。

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section{参考文献 }
\begin{thebibliography}{99}

%\bibitem{dingtongren} 丁同仁、李承治，常微分方程教程，高等教育出版社，2022年3月第三版。
\bibitem{sishoukui-2} 司守奎,孙玺菁. \emph{Python数学建模算法与应用}, 国防工业出版社. 2022年1月第1版. 
%\bibitem{hexiaoqun-ara} 何晓群. \emph{应用回归分析(R语言版)}. 电子工业出版社. 2017年7月第1版. 
%\bibitem{dalgaard} Peter Dalgaard 著, 郝智恒等译. \emph{R语言统计入门}. 人民邮电出版社. 2014年6月第1版. 

\bibitem{heintz} Nicolas Heintz, Tom Francart, and Alexander Bertrand. \emph{Minimally Informed Linear Discriminant Analysis: training an LDA model with unlabelled data}. Arxiv 2310.11110. 

\bibitem{fisher-LDA} R. Fisher. \emph{The Use of Multiple Measurements in Taxonomic Problems}, Annals of Eugenics, vol. 7, no. 2, pp. 179–188, 9, 1936.

\bibitem{thalles} \url{https://sthalles.github.io/fisher-linear-discriminant/} 

\end{thebibliography}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





